# Read the egg-price CSV and standardize the column names.
egg_data <- read.csv("eggs_price_2025.csv")
names(egg_data) <- c("Date", "Price")

# Interactive line plot of the raw monthly price series.
egg_data %>%
  plot_ly(x = ~as.Date(Date)) %>%
  add_lines(y = ~Price, name = 'Egg prices (USD)', line = list(color = 'blue')) %>%
  layout(title = "Time series plot in Egg Prices",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Egg Prices"))
Review Midterm
The questions will be the same as the sample exam, but the dataset is the eggs_price dataset in lab4's folder.
# Build a monthly time series of egg prices, starting at the earliest date.
# NOTE(review): min() on the Date column compares strings lexicographically —
# correct only if the CSV dates are ISO "YYYY-MM-DD"; confirm the file format.
ts_egg <- ts(egg_data$Price,
start = decimal_date(as.Date(min(egg_data$Date))),
frequency = 12) # Monthly data
plot(ts_egg, main = "Time Series plot")Checking for stationary
# Sample ACF of the raw (undifferenced) series, shown out to lag 50.
f1 <- ggAcf(ts_egg, lag.max = 50) +
  theme_minimal() +
  ggtitle("ACF plot without differencing")
f1
The ACF decays slowly to 0, which is a typical sign of non-stationarity. The ADF test below indicates the same.
# ADF test: H0 = unit root (non-stationary). The reported p-value (0.2793)
# fails to reject H0, consistent with the slow ACF decay.
tseries::adf.test(ts_egg)
Augmented Dickey-Fuller Test
data: ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary
Lag plot:
# Lag plot: the series plotted against its own lagged values (points only).
gglagplot(ts_egg, do.lines = FALSE) +
ggtitle("Lag Plot of Original Egg Prices") +
theme_minimal()
From lags 1-4 we can see clear autocorrelation; at higher lags the lag plot does not show a consistent pattern of autocorrelation.
# Attach gridExtra for grid.arrange(). library() errors loudly if the
# package is missing, unlike require(), which silently returns FALSE.
library(gridExtra)
Attaching package: 'gridExtra'
The following object is masked from 'package:dplyr':
combine
# Build the two panels for the log-transformation comparison.
plot2 <- autoplot(log(ts_egg), main = "log transformed data")
plot1 <- autoplot(ts_egg, main = "without the log transformation")
grid.arrange(plot1, plot2,nrow=2)
There is no big difference between the two, so we do not need a log transformation in this case.
# Re-run the ADF test on the original series: p = 0.2793 > 0.05, so we
# again fail to reject the unit-root null — the series is non-stationary.
tseries::adf.test(ts_egg)
Augmented Dickey-Fuller Test
data: ts_egg
Dickey-Fuller = -2.7064, Lag order = 8, p-value = 0.2793
alternative hypothesis: stationary
The series is not stationary, so we take the first difference.
# ACF of the original series (50 lags); the slow decay signals non-stationarity.
fig1 <- ggAcf(ts_egg, 50) +
ggtitle("ACF of egg's price time series") +
theme_minimal()
fig1Non-stationary
# First-difference the series to remove the trend.
# NOTE(review): the name `diff` masks base::diff() from here on — consider
# renaming (e.g. diff_egg), which would require updating all later uses.
diff <- diff(ts_egg)
# ACF of the differenced series — spikes die out quickly if it is stationary.
fig_diff1 <- ggAcf(diff, 50) +
ggtitle("ACF For differenced egg price") +
theme_minimal()
fig_diff1
tseries::adf.test(diff)
Warning in tseries::adf.test(diff): p-value smaller than printed p-value
Augmented Dickey-Fuller Test
data: diff
Dickey-Fuller = -8.3505, Lag order = 8, p-value = 0.01
alternative hypothesis: stationary
Now it is stationary, second differencing is not needed.
# One difference achieved stationarity, so d = 1.
# Candidate q from the ACF of the differenced series (about 2 significant
# spikes): q in {0, 1, 2}.
fig_diff_P <- ggPacf(diff, 50) +
ggtitle("PACF for differenced data") +
theme_minimal()
fig_diff_P # PACF suggests candidate p in {0, 1, 2}
library(knitr)
library(kableExtra)
Attaching package: 'kableExtra'
The following object is masked from 'package:dplyr':
group_rows
library(forecast)
# Create an empty matrix to store results
# Grid search over ARIMA(p, d, q): d in 1:2, p in 0:2, q in 0:2, recording
# AIC / BIC / AICc for each fitted model.
d_vals <- 1:2
p_vals <- 0:2
q_vals <- 0:2
# Size the results matrix exactly (one row per combination) so the final
# table has no leftover NA rows.
results_matrix <- matrix(NA_real_,
                         nrow = length(d_vals) * length(p_vals) * length(q_vals),
                         ncol = 6)
# Row index into results_matrix.
i <- 1
for (d in d_vals) {
  for (p in p_vals) {
    for (q in q_vals) {
      # Arima() ignores drift (with a warning) when d >= 2, so only request
      # a drift term for singly-differenced models.
      model <- Arima(ts_egg, order = c(p, d, q), include.drift = (d == 1))
      # Store the order and its information criteria.
      results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
      i <- i + 1
    }
  }
}
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
Warning in Arima(ts_egg, order = c(p, d, q), include.drift = TRUE): No drift
term fitted as the order of difference is 2 or more.
# Collect the grid-search results into a labelled data frame.
results_df <- as.data.frame(results_matrix)
colnames(results_df) <- c("p", "d", "q", "AIC", "BIC", "AICc")
# Row index of the model with the smallest (best) AIC; which.min() skips NAs.
highlight_row <- which.min(results_df$AIC)
# Render the comparison table, highlighting the best-AIC row.
knitr::kable(results_df, align = 'c', caption = "Comparison of ARIMA Models") %>%
kable_styling(full_width = FALSE, position = "center") %>%
row_spec(highlight_row, bold = TRUE, background = "#FFFF99") # Highlight row in yellow| p | d | q | AIC | BIC | AICc |
|---|---|---|---|---|---|
| 0 | 1 | 0 | -676.7017 | -668.1222 | -676.6793 |
| 0 | 1 | 1 | -678.7610 | -665.8919 | -678.7161 |
| 0 | 1 | 2 | -681.5609 | -664.4021 | -681.4860 |
| 1 | 1 | 0 | -679.5362 | -666.6671 | -679.4914 |
| 1 | 1 | 1 | -681.2114 | -664.0526 | -681.1365 |
| 1 | 1 | 2 | -682.4048 | -660.9562 | -682.2922 |
| 2 | 1 | 0 | -682.5958 | -665.4369 | -682.5209 |
| 2 | 1 | 1 | -681.4265 | -659.9779 | -681.3139 |
| 2 | 1 | 2 | -689.1763 | -663.4380 | -689.0184 |
| 0 | 2 | 0 | -364.6668 | -360.3789 | -364.6593 |
| 0 | 2 | 1 | -668.1642 | -659.5885 | -668.1418 |
| 0 | 2 | 2 | -670.3585 | -657.4949 | -670.3135 |
| 1 | 2 | 0 | -521.5931 | -513.0174 | -521.5706 |
| 1 | 2 | 1 | -671.1789 | -658.3153 | -671.1339 |
| 1 | 2 | 2 | -673.0732 | -655.9218 | -672.9982 |
| 2 | 2 | 0 | -573.8695 | -561.0059 | -573.8246 |
| 2 | 2 | 1 | -674.4413 | -657.2898 | -674.3662 |
| 2 | 2 | 2 | -673.3408 | -651.9015 | -673.2280 |
| NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA |
So far, model ARIMA(2, 1, 2) is the best.
auto.arima(ts_egg)Series: ts_egg
ARIMA(4,1,1)(0,0,2)[12] with drift
Coefficients:
ar1 ar2 ar3 ar4 ma1 sma1 sma2 drift
0.9745 0.0595 -0.0021 -0.1609 -0.9473 0.1057 0.1498 0.0045
s.e. 0.0506 0.0609 0.0617 0.0459 0.0297 0.0489 0.0572 0.0028
sigma^2 = 0.0155: log likelihood = 361.64
AIC=-705.27 AICc=-704.93 BIC=-666.66
auto.arima() selects ARIMA(4, 1, 1) (with seasonal terms and drift) as the best model.
# Refined grid search suggested by auto.arima: keep d = 1 and widen p to 0:4.
# Load necessary libraries
library(knitr)
library(kableExtra)
library(forecast)
p_vals2 <- 0:4
q_vals2 <- 0:2
# Size the results matrix exactly (one row per combination) so the final
# table has no leftover NA rows.
results_matrix <- matrix(NA_real_,
                         nrow = length(p_vals2) * length(q_vals2),
                         ncol = 6)
# Row index into results_matrix.
i <- 1
for (d in 1) {
  for (p in p_vals2) {
    for (q in q_vals2) {
      # All models here use d = 1, so a drift term is always permitted.
      model <- Arima(ts_egg, order = c(p, d, q), include.drift = TRUE)
      # Store the order and its information criteria.
      results_matrix[i, ] <- c(p, d, q, model$aic, model$bic, model$aicc)
      i <- i + 1
    }
  }
}
# Collect the refined grid-search results into a labelled data frame.
results_df <- as.data.frame(results_matrix)
colnames(results_df) <- c("p", "d", "q", "AIC", "BIC", "AICc")
# Row index of the model with the smallest (best) AIC; which.min() skips NAs.
highlight_row <- which.min(results_df$AIC)
# Render the comparison table, highlighting the best-AIC row.
knitr::kable(results_df, align = 'c', caption = "Comparison of ARIMA Models") %>%
kable_styling(full_width = FALSE, position = "center") %>%
row_spec(highlight_row, bold = TRUE, background = "#FFFF99") # Highlight row in yellow| p | d | q | AIC | BIC | AICc |
|---|---|---|---|---|---|
| 0 | 1 | 0 | -676.7017 | -668.1222 | -676.6793 |
| 0 | 1 | 1 | -678.7610 | -665.8919 | -678.7161 |
| 0 | 1 | 2 | -681.5609 | -664.4021 | -681.4860 |
| 1 | 1 | 0 | -679.5362 | -666.6671 | -679.4914 |
| 1 | 1 | 1 | -681.2114 | -664.0526 | -681.1365 |
| 1 | 1 | 2 | -682.4048 | -660.9562 | -682.2922 |
| 2 | 1 | 0 | -682.5958 | -665.4369 | -682.5209 |
| 2 | 1 | 1 | -681.4265 | -659.9779 | -681.3139 |
| 2 | 1 | 2 | -689.1763 | -663.4380 | -689.0184 |
| 3 | 1 | 0 | -682.9246 | -661.4760 | -682.8120 |
| 3 | 1 | 1 | -681.6765 | -655.9382 | -681.5186 |
| 3 | 1 | 2 | -690.9219 | -660.8939 | -690.7110 |
| 4 | 1 | 0 | -683.1827 | -657.4444 | -683.0248 |
| 4 | 1 | 1 | -695.9539 | -665.9259 | -695.7430 |
| 4 | 1 | 2 | -694.0638 | -659.7460 | -693.7921 |
| NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA |
| NA | NA | NA | NA | NA | NA |
ARIMA(4, 1, 2) is so far the best.
Diag:
Possible models:
ARIMA(4, 1, 1) -> Auto.arima ARIMA(2, 1, 2) -> First choice
# Fit ARIMA(2,1,2) via sarima() and capture its printed output so we can
# display just the coefficient table.
model_output <- capture.output(sarima(ts_egg, 2, 1, 2))
# Locate the start of the coefficient details by keyword.
# NOTE(review): grep() can return multiple matches; this assumes exactly one.
start_line <- grep("Coefficients", model_output)
end_line <- length(model_output) # Last line of output
# Print only the coefficient section.
cat(model_output[start_line:end_line], sep = "\n")Coefficients:
Estimate SE t.value p.value
ar1 0.7828 0.0584 13.4153 0.0000
ar2 -0.8377 0.0940 -8.9131 0.0000
ma1 -0.7742 0.0265 -29.2028 0.0000
ma2 0.9338 0.0827 11.2940 0.0000
constant 0.0061 0.0060 1.0224 0.3071
sigma^2 estimated as 0.0159264 on 534 degrees of freedom
AIC = -1.27862 AICc = -1.278411 BIC = -1.230868
# Fit ARIMA(4,1,1) via sarima() and capture its printed output so we can
# display just the coefficient table.
model_output <- capture.output(sarima(ts_egg, 4, 1, 1))
# Locate the start of the coefficient details by keyword.
# NOTE(review): grep() can return multiple matches; this assumes exactly one.
start_line <- grep("Coefficients", model_output)
end_line <- length(model_output) # Last line of output
# Print only the coefficient section.
cat(model_output[start_line:end_line], sep = "\n")Coefficients:
Estimate SE t.value p.value
ar1 0.9655 0.0554 17.4269 0.0000
ar2 0.0288 0.0604 0.4770 0.6336
ar3 -0.0007 0.0615 -0.0108 0.9914
ar4 -0.1464 0.0460 -3.1832 0.0015
ma1 -0.9158 0.0363 -25.2417 0.0000
constant 0.0049 0.0030 1.6049 0.1091
sigma^2 estimated as 0.01567457 on 533 degrees of freedom
AIC = -1.291195 AICc = -1.290902 BIC = -1.235484
We select ARIMA(2, 1, 2): although ARIMA(4, 1, 1) has a marginally lower AIC, its ar2 and ar3 coefficients are not significant (p = 0.63 and 0.99), so the simpler model with all-significant coefficients is preferred.
# Fit the chosen ARIMA(2,1,2) model (no drift) for forecasting.
fit <- Arima(ts_egg, order = c(2, 1, 2), include.drift = FALSE)
summary(fit)Series: ts_egg
ARIMA(2,1,2)
Coefficients:
ar1 ar2 ma1 ma2
0.7836 -0.8356 -0.7742 0.9322
s.e. 0.0617 0.1029 0.0270 0.0915
sigma^2 = 0.01608: log likelihood = 350.07
AIC=-690.13 AICc=-690.02 BIC=-668.68
Training set error measures:
ME RMSE MAE MPE MAPE MASE
Training set 0.005541111 0.1262059 0.0776439 0.04752214 5.256716 0.3163514
ACF1
Training set 0.06016651
# Forecast ahead with the fitted ARIMA(2,1,2).
# NOTE(review): ts_egg is monthly (frequency = 12), so h = 365 means 365
# months (~30 years), not one year of days — confirm the intended horizon
# (h = 12 would be one year).
forecast_result <- forecast(fit, h = 365)
# Display training-set forecast accuracy measures
accuracy(forecast_result) ME RMSE MAE MPE MAPE MASE
Training set 0.005541111 0.1262059 0.0776439 0.04752214 5.256716 0.3163514
ACF1
Training set 0.06016651
# Plot the point forecasts with their prediction intervals.
autoplot(forecast_result) +
labs(title = "ARIMA(2,1,2) Forecast",
x = "Time",
y = "Predicted Values") +
theme_minimal()
# Next: fit benchmark models individually and check residuals.
# Mean benchmark: every future value equals the historical mean (h = 200).
f_mean <- meanf(ts_egg, h = 200)
head(f_mean$mean) # first six forecasted values (all equal the series mean)
2025 1.37203 1.37203 1.37203 1.37203 1.37203 1.37203
mean(ts_egg)[1] 1.37203
# Naive benchmark: each forecast equals the last observed value.
f_naive <- naive(ts_egg, h = 200)
#checkresiduals(f_naive)
# Random-walk-with-drift benchmark; inspect residuals for leftover
# autocorrelation (Ljung-Box output follows).
f_drift <- rwf(ts_egg, drift = TRUE, h = 200)
checkresiduals(f_drift)
Ljung-Box test
data: Residuals from Random walk with drift
Q* = 73.623, df = 24, p-value = 6.101e-07
Model df: 0. Total lags used: 24
# Generate forecasts from each benchmark method plus the fitted ARIMA.
# NOTE(review): h = 365 is 365 *months* on this monthly series — confirm
# the intended horizon.
mean_forecast <- meanf(ts_egg, h = 365)
naive_forecast <- naive(ts_egg, h = 365)
drift_forecast <- rwf(ts_egg, drift = TRUE, h = 365)
arima_forecast <- forecast(fit, h = 365)
# Convert each forecast (and the observed series) to plain data frames,
# since plotly works most naturally with data-frame columns.
mean_df <- data.frame(Date = time(mean_forecast$mean), Mean = as.numeric(mean_forecast$mean))
naive_df <- data.frame(Date = time(naive_forecast$mean), Naive = as.numeric(naive_forecast$mean))
drift_df <- data.frame(Date = time(drift_forecast$mean), Drift = as.numeric(drift_forecast$mean))
arima_df <- data.frame(Date = time(arima_forecast$mean), ARIMA_Fit = as.numeric(arima_forecast$mean))
ts_egg_df <- data.frame(Date = time(ts_egg), Price = as.numeric(ts_egg))
# Overlay all four forecasts on the observed series.
plot_ly() %>%
  add_lines(data = ts_egg_df, x = ~Date, y = ~Price, name = 'Original Data', line = list(color = 'black')) %>%
  add_lines(data = mean_df, x = ~Date, y = ~Mean, name = 'Mean Forecast', line = list(color = 'blue')) %>%
  add_lines(data = naive_df, x = ~Date, y = ~Naive, name = 'Naïve Forecast', line = list(color = 'red')) %>%
  add_lines(data = drift_df, x = ~Date, y = ~Drift, name = 'Drift Forecast', line = list(color = 'green')) %>%
  add_lines(data = arima_df, x = ~Date, y = ~ARIMA_Fit, name = 'ARIMA Fit', line = list(color = 'purple')) %>%
  # Fixed copy-paste labels: this plot shows egg prices, not job postings.
  layout(title = 'Egg Price Forecast Comparison',
         xaxis = list(title = 'Date'),
         yaxis = list(title = 'Egg Price (USD)'),
         legend = list(title = list(text = 'Forecast Methods')))